import spacy
import jsonlines
import random
from tqdm import tqdm

# Read the whole corpus into memory as a list of raw lines (each line keeps
# its trailing newline). The context manager guarantees the file handle is
# closed as soon as the lines have been read.
# NOTE(review): the variable is called `qasc` but the path points at the
# OMCS file — name kept as-is because the rest of the script refers to it.
with open("../data/triples/omcs.txt") as _corpus_file:
    qasc = _corpus_file.readlines()

# Large English spaCy pipeline used for all parsing in this script.
nlp = spacy.load('en_core_web_lg')


import pickle
# Cache of already-parsed inputs: maps the exact input text to the object
# returned by `nlp` for that text.
docmap = {}


def get_doc(docs):
    """Return the result of ``nlp(docs)``, parsing each distinct input once.

    The result is stored in the module-level ``docmap`` under ``docs`` itself,
    so repeated calls with the same text return the same cached object.
    """
    # Only run the pipeline on a cache miss; hits fall straight through.
    if docs not in docmap:
        docmap[docs] = nlp(docs)
    return docmap[docs]

def save_maps(fname, mmap):
    """Pickle ``mmap`` to the file at ``fname``.

    The file is created or truncated, and the object is serialised with the
    highest pickle protocol available to the running interpreter.
    """
    with open(fname, 'wb+') as sink:
        writer = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(mmap)
        
def groups(stream, size):
    """Yield consecutive batches of up to ``size`` items from ``stream``.

    Every batch is a list of exactly ``size`` items, except possibly the
    last one, which holds whatever items remain. An empty ``stream`` yields
    nothing.

    Args:
        stream: Any iterable; it is consumed lazily, one item at a time.
        size: Maximum number of items per batch; must be at least 1.

    Raises:
        ValueError: If ``size`` is smaller than 1 (raised when iteration
            starts, since this is a generator).
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    batch = []
    for item in stream:
        batch.append(item)
        if len(batch) == size:
            yield batch
            # Start a fresh list only after a full batch has been handed
            # out; resetting on every item would discard partial batches.
            batch = []
    # Flush the final, possibly shorter, batch.
    if batch:
        yield batch
        
def chunker(seq, size):
    """Split a sliceable sequence into consecutive slices of length ``size``.

    Returns a list of slices of ``seq``; the last slice is shorter when
    ``len(seq)`` is not a multiple of ``size``. An empty ``seq`` gives ``[]``.
    """
    pieces = []
    for start in range(0, len(seq), size):
        stop = start + size
        pieces.append(seq[start:stop])
    return pieces
    
# Extraction results, both keyed by the whitespace-stripped sentence text:
#   docnounmap -> list of noun-chunk texts found in the sentence
#   docverbmap -> list of texts of tokens tagged as VERB
docnounmap = {}
docverbmap = {}

# Pre-split the corpus into 400-line chunks so tqdm reports progress per chunk.
chunks = chunker(qasc, 400)

for sentences in tqdm(chunks, desc="Creating Docs"):
    # `n_threads` is deliberately not passed: spaCy deprecated it as a no-op
    # in v2.1 and removed it in v3, where supplying it raises TypeError.
    docs = nlp.pipe(sentences, batch_size=400)
    for doc in docs:
        # Input lines still carry their trailing newline; strip once and use
        # the same key for both maps.
        key = doc.text.strip()
        docnounmap[key] = [x.text for x in doc.noun_chunks]
        docverbmap[key] = [x.text for x in doc if x.pos_ == "VERB"]
#         docmap[doc.text]=doc
# save_maps("qasc_all_sents.pickled",docmap)
save_maps("omcsnp_map.pickled", docnounmap)
save_maps("omcsvp_map.pickled", docverbmap)